In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from textblob import TextBlob # For sentiment analysis
In [2]:
data = pd.read_csv('netflix_titles.csv')
In [3]:
data.shape
Out[3]:
(8807, 12)
In [4]:
data.columns
Out[4]:
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')
In [5]:
data.head(2)
Out[5]:
See Full Dataframe in Mito
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.

Lokking at the distribution of content rating on netflix¶

In [6]:
d = data.groupby(['rating']).size().reset_index(name='counts')
In [7]:
piechart = px.pie(d, values = 'counts', names='rating', title='Distribution of rating', color_discrete_sequence=px.colors.qualitative.Set3)
piechart.show()

Top 5 Actors and Directors¶

In [8]:
data['director']=data['director'].fillna('No Director Available')
In [9]:
filtered_director = pd.DataFrame()
In [10]:
filtered_director = data['director'].str.split(',', expand=True).stack()
filtered_director.head(9)
Out[10]:
0  0          Kirsten Johnson
1  0    No Director Available
2  0          Julien Leclercq
3  0    No Director Available
4  0    No Director Available
5  0            Mike Flanagan
6  0            Robert Cullen
   1           José Luis Ucha
7  0             Haile Gerima
dtype: object
In [11]:
filtered_director = filtered_director.to_frame()
In [12]:
filtered_director.columns = ['Director']
In [13]:
directors = filtered_director.groupby(['Director']).size().reset_index(name='Total Content')
directors.head()
Out[13]:
See Full Dataframe in Mito
Director Total Content
0 Aaron Moorhead 2
1 Aaron Woolf 1
2 Abbas Alibhai Burmawalla 1
3 Abdullah Al Noor 1
4 Abhinav Shiv Tiwari 1
In [14]:
directors=directors[directors.Director != 'No Director Available']
In [15]:
directors = directors.sort_values(by=['Total Content'], ascending=False)
In [16]:
directors_top_5 = directors.head()
directors_top_5
Out[16]:
See Full Dataframe in Mito
Director Total Content
4021 Rajiv Chilaka 22
4068 Raúl Campos 18
261 Jan Suter 18
4652 Suhas Kadav 16
3235 Marcus Raboy 16
In [17]:
directors_top_5 = directors_top_5.sort_values(by=['Total Content'])
directors_top_5
Out[17]:
See Full Dataframe in Mito
Director Total Content
4652 Suhas Kadav 16
3235 Marcus Raboy 16
4068 Raúl Campos 18
261 Jan Suter 18
4021 Rajiv Chilaka 22

List of the Top 5 Directors on Netflix¶

In [18]:
fig_1 = px.bar(directors_top_5, x='Total Content', y='Director', title='Top 5 Directors on Netflix')
fig_1.show()

Check the top 5 Actors¶

In [19]:
data['cast']=data['cast'].fillna('No Cast Present')
In [20]:
filtered_cast = pd.DataFrame()
In [21]:
filtered_cast = data['cast'].str.split(',', expand=True).stack()
In [22]:
filtered_cast.head()
Out[22]:
0  0    No Cast Present
1  0         Ama Qamata
   1        Khosi Ngema
   2      Gail Mabalane
   3     Thabang Molaba
dtype: object
In [23]:
filtered_cast = filtered_cast.to_frame()
In [24]:
filtered_cast.columns = ['Actor']
In [25]:
actors = filtered_cast.groupby(by=['Actor']).size().reset_index(name='Total Content')
actors.head()
Out[25]:
See Full Dataframe in Mito
Actor Total Content
0 Jr. 2
1 "Riley" Lakdhar Dridi 1
2 'Najite Dede 1
3 2 Chainz 1
4 2Mex 1
In [84]:
actors = actors[actors.Actor != 'No Cast Present']
In [85]:
actors=actors.sort_values(by=['Total Content'], ascending=False)
actors.head()
Out[85]:
See Full Dataframe in Mito
Actor Total Content
2612 Anupam Kher 39
26941 Rupa Bhimani 31
30303 Takahiro Sakurai 30
15541 Julie Tejwani 28
23624 Om Puri 27
In [87]:
actors_top_5 = actors.head()
In [92]:
actors_top_5 = actors_top_5.sort_values(by=['Total Content'], ascending=True)
actors_top_5
Out[92]:
See Full Dataframe in Mito
Actor Total Content
23624 Om Puri 27
15541 Julie Tejwani 28
30303 Takahiro Sakurai 30
26941 Rupa Bhimani 31
2612 Anupam Kher 39

These are the Top 5 Actors on Netflix¶

In [93]:
fig_2 = px.bar(actors_top_5, x='Total Content', y='Actor', title='Top 5 Actors on Netflix')
fig_2

Analyzing the Content on Netflix over the Years¶

In [31]:
df1= data[['type', 'release_year']]
In [32]:
df1.isnull().sum()
Out[32]:
type            0
release_year    0
dtype: int64
In [33]:
df2 = df1.groupby(['release_year', 'type']).size().reset_index(name='Total Content')
df2.head()
Out[33]:
See Full Dataframe in Mito
release_year type Total Content
0 1925 TV Show 1
1 1942 Movie 2
2 1943 Movie 3
3 1944 Movie 3
4 1945 Movie 3
In [34]:
df2 = df2[df2['release_year'] > 2010]

Trend of Content Produced over the Years on Netflix¶

In [35]:
fig_3 = px.line(df2, x='release_year', y='Total Content', title='Trend of Content Produced in the Years', color='type')
fig_3

Sentiment analysis of the content on netflix¶

In [36]:
df_1 = data[['release_year', 'description']]
In [37]:
df_1 = df_1.rename(columns={'release_year':'Release Year'})
In [38]:
for index, row in df_1.iterrows():
    d=row['description']
    testimonial=TextBlob(d)
    s=testimonial.sentiment.polarity
    if s==0:
        sent = 'Neutral'
    elif s>0:
        sent = 'Positive'
    else:
        sent = 'Negative'
    df_1.loc[[index, 2], 'Sentiment']=sent

df_1 = df_1.groupby(['Release Year', 'Sentiment']).size().reset_index(name='Total Content')

df_1 = df_1[df_1['Release Year'] >= 2010]
fig_4 = px.bar(df_1, x='Release Year', y='Total Content', color='Sentiment', title='Sentiment Content on Netflix')
fig_4
In [82]:
!jupyter nbconvert --to html DA_portfolio_project_11.ipynb
[NbConvertApp] Converting notebook DA_portfolio_project_11.ipynb to html
[NbConvertApp] Writing 4384392 bytes to DA_portfolio_project_11.html
In [ ]: